{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# UMich EECS 445 Project 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import itertools\n",
    "\n",
    "import string as s\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.svm import SVC\n",
    "from sklearn import metrics\n",
    "from matplotlib import pyplot as plt;\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "500"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load_data\n",
    "\n",
    "def load_data(fname):\n",
    "  \"\"\"\n",
    "  Reads in a csv file and return a dataframe. A dataframe df is similar to dictionary.\n",
    "  You can access the label by calling df['label'], the content by df['content']\n",
    "  the sentiment by df['sentiment']\n",
    "  \"\"\"\n",
    "  return pd.read_csv(fname)\n",
    "\n",
    "data = load_data('dataset.csv')\n",
    "#data['label'], data['content']\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2201"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def extract_dictionary(df):\n",
    "    \"\"\"\n",
    "    Reads a panda dataframe, and returns a dictionary of distinct words\n",
    "        mapping from the distinct word to its index (ordered by when it was found).\n",
    "    Input:\n",
    "        dataframe/output of load_data()        \n",
    "    Returns: \n",
    "        a dictionary of distinct words\n",
    "        mapping from the distinct word to its index (ordered by when it was found).\n",
    "    \"\"\"\n",
    "    transtable = str.maketrans(s.punctuation, ' '*len(s.punctuation))\n",
    "    word_dict = {}\n",
    "    index = -1\n",
    "    for i in range(len(data)):\n",
    "        post = data['content'][i]\n",
    "        post = post.translate(transtable)\n",
    "        post = post.split()\n",
    "        for word in post:\n",
    "            word = word.lower()\n",
    "            if word not in word_dict:\n",
    "                index += 1\n",
    "                word_dict[word] = index\n",
    "                \n",
    "    return word_dict\n",
    "    \n",
    "word_dict = extract_dictionary(data)\n",
    "len(word_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(500, 2201)"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def generate_feature_matrix(df, word_dict):\n",
    "    \"\"\"\n",
    "    Reads a dataframe and the dictionary of words in the reviews\n",
    "        to generate {1, 0} feature vectors for each review. The resulting feature\n",
    "        matrix should be of dimension (number of tweets, number of words).\n",
    "    Input:\n",
    "        df - dataframe that has the tweets and labels\n",
    "        word_list- dictionary of words mapping to indices\n",
    "    Returns: \n",
    "        a feature matrix of dimension (number of tweets, number of words)\n",
    "    \"\"\"\n",
    "    fmatrix = np.zeros((len(df), len(word_dict)))\n",
    "    transtable = str.maketrans(s.punctuation, ' '*len(s.punctuation))\n",
    "    for i in range(len(data)):\n",
    "        post = data['content'][i]\n",
    "        post = post.translate(transtable)\n",
    "        post = post.split()\n",
    "        for word in post:\n",
    "            word = word.lower()\n",
    "            if word in word_dict:\n",
    "                fmatrix[i][word_dict[word]] = 1\n",
    "                \n",
    "    return fmatrix\n",
    "\n",
    "fmatrix = generate_feature_matrix(data, word_dict)\n",
    "fmatrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def generate_label(df):\n",
    "    \"\"\"\n",
    "    Input:\n",
    "        dataFrame by load_data\n",
    "    Output:\n",
    "        numpy.array with length 500. 1 for love, 0 for hate\n",
    "    \"\"\"\n",
    "    label = np.zeros((len(df)))\n",
    "    for i in range(len(df)):\n",
    "        if data['sentiment'][i] == 'love':\n",
    "            label[i] = 1\n",
    "            \n",
    "    return label\n",
    "        \n",
    "label = generate_label(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.71201720172017191"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def cv_performance(clf, X, y, k=5, metric=\"accuracy\"):\n",
    "    \"\"\"\n",
    "    Splits the data, X and y, into k-folds and runs k-fold crossvalidation:\n",
    "        training a classifier on K-1 folds and testing on the remaining fold.\n",
    "        Calculates the k-fold crossvalidation performance metric for classifier\n",
    "        clf by averaging the performance across folds.\n",
    "    Input:\n",
    "        clf - an instance of SVC()\n",
    "        X - (n,d) array of feature vectors, where n is the number of examples\n",
    "           and d is the number of features\n",
    "        y - (n,) array of binary labels {1,-1}\n",
    "        k - int specificyin the number of folds (default=5)\n",
    "        metric - string specifying the performance metric (default='accuracy',\n",
    "            other options: 'f1-score', 'auroc', 'precision', 'sensitivity',\n",
    "            and 'specificity')\n",
    "    Returns: average 'test' performance across the k folds as np.float64\n",
    "    \"\"\"\n",
    "    # todo, only accuracy and no CV now\n",
    "    skf = StratifiedKFold(n_splits=k)\n",
    "    avg_accu = 0\n",
    "    for train_index, test_index in skf.split(X, y):\n",
    "        clf.fit(X[train_index], y[train_index])\n",
    "        predict = clf.predict(X[test_index])\n",
    "        accu = (1 - np.abs(y[test_index] - predict)).sum() / len(test_index)\n",
    "        avg_accu += accu\n",
    "    \n",
    "    return avg_accu / k\n",
    "    \n",
    "cv_performance(SVC(kernel='linear'), fmatrix, label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1000.0"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def select_param_linear(X, y, k=5, metric=\"accuracy\", C_range = [], penalty='l2'):\n",
    "    \"\"\"\n",
    "    Sweeps different settings for the hyperparameter of a linear-kernel SVM,\n",
    "        calculating the k-fold CV performance for each setting on X, y.\n",
    "    Input:\n",
    "        X - (n,d) array of feature vectors, where n is the number of examples\n",
    "            and d is the number of features\n",
    "        y - (n,) array of binary labels {1,-1}\n",
    "        k - int specifying the number of folds (default=5)\n",
    "            metric- string specifying the performance metric (default='accuracy',\n",
    "            other options: 'f1-score', 'auroc', 'precision', 'sensitivity',\n",
    "            and 'specificity')\n",
    "        C_range - an array with all C values to be checked for\n",
    "    Returns the parameter value for linear-kernel SVM, that 'maximizes' the\n",
    "        average 5-fold CV performance.\n",
    "    \"\"\"\n",
    "    max_perf = 0\n",
    "    final_c = 0\n",
    "    for c in C_range:\n",
    "        perf = cv_performance(SVC(kernel='linear', C=c, class_weight='balanced'), X, y, k, metric)\n",
    "        if perf > max_perf:\n",
    "            max_perf = perf\n",
    "            final_c = c\n",
    "            \n",
    "    return final_c\n",
    "\n",
    "C_range = np.array([10**(-3), 1, 10**(3)])\n",
    "select_param_linear(fmatrix, label, C_range=C_range)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
